# Import pandas and numpy
import pandas as pd
import numpy as np
# Read the raw 2022 train and test CSV files into two DataFrames
# (the relative paths are resolved against the current working directory).
df_train, df_test = map(
    pd.read_csv,
    ('../data/raw/2022_train.csv', '../data/raw/2022_test.csv'),
)
# Preview the first five rows of the train DataFrame (head() defaults to n=5)
df_train.head()
| | Id | GP | MIN | PTS | FGM | FGA | FG% | 3P Made | 3PA | 3P% | ... | FTA | FT% | OREB | DREB | REB | AST | STL | BLK | TOV | TARGET_5Yrs |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3799 | 80 | 24.3 | 7.8 | 3.0 | 6.4 | 45.7 | 0.1 | 0.3 | 22.6 | ... | 2.9 | 72.1 | 2.2 | 2.0 | 3.8 | 3.2 | 1.1 | 0.2 | 1.6 | 1 |
| 1 | 3800 | 75 | 21.8 | 10.5 | 4.2 | 7.9 | 55.1 | -0.3 | -1.0 | 34.9 | ... | 3.6 | 67.8 | 3.6 | 3.7 | 6.6 | 0.7 | 0.5 | 0.6 | 1.4 | 1 |
| 2 | 3801 | 85 | 19.1 | 4.5 | 1.9 | 4.5 | 42.8 | 0.4 | 1.2 | 34.3 | ... | 0.6 | 75.7 | 0.6 | 1.8 | 2.4 | 0.8 | 0.4 | 0.2 | 0.6 | 1 |
| 3 | 3802 | 63 | 19.1 | 8.2 | 3.5 | 6.7 | 52.5 | 0.3 | 0.8 | 23.7 | ... | 1.5 | 66.9 | 0.8 | 2.0 | 3.0 | 1.8 | 0.4 | 0.1 | 1.9 | 1 |
| 4 | 3803 | 63 | 17.8 | 3.7 | 1.7 | 3.4 | 50.8 | 0.5 | 1.4 | 13.7 | ... | 0.5 | 54.0 | 2.4 | 2.7 | 4.9 | 0.4 | 0.4 | 0.6 | 0.7 | 1 |
5 rows × 21 columns
# Preview the first five rows of the test DataFrame (head() defaults to n=5)
df_test.head()
| | Id | GP | MIN | PTS | FGM | FGA | FG% | 3P Made | 3PA | 3P% | FTM | FTA | FT% | OREB | DREB | REB | AST | STL | BLK | TOV |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 56 | 9.1 | 4.0 | 1.6 | 3.7 | 43.7 | 0.1 | 0.3 | 7.3 | 0.7 | 1.2 | 63.4 | 1.2 | 0.8 | 1.7 | 0.4 | 0.2 | 0.3 | 0.8 |
| 1 | 1 | 43 | 19.3 | 10.1 | 3.7 | 8.1 | 46.0 | 0.6 | 1.7 | 35.1 | 1.8 | 2.5 | 75.3 | 0.5 | 0.9 | 1.5 | 3.5 | 0.6 | 0.0 | 1.8 |
| 2 | 2 | 82 | 33.9 | 11.3 | 4.9 | 10.6 | 45.6 | 0.5 | 1.9 | 44.8 | 1.8 | 2.7 | 71.2 | 1.3 | 3.3 | 4.5 | 2.5 | 1.3 | 0.3 | 2.0 |
| 3 | 3 | 86 | 44.7 | 18.8 | 6.8 | 15.9 | 42.9 | 0.5 | 1.8 | 13.5 | 4.5 | 6.3 | 70.9 | 1.5 | 3.2 | 5.0 | 4.1 | 0.9 | 0.1 | 3.6 |
| 4 | 4 | 58 | 12.3 | 4.7 | 1.6 | 4.0 | 40.0 | 0.5 | 1.7 | 38.7 | 1.1 | 1.3 | 76.9 | 0.2 | 0.6 | 0.9 | 1.5 | 0.5 | -0.4 | 0.9 |
# Show the (rows, columns) shape of the train DataFrame
df_train.shape
(8000, 21)
# Show the (rows, columns) shape of the test DataFrame
df_test.shape
(3799, 20)
# Summarise the train DataFrame: column names, non-null counts, dtypes and memory usage
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Id           8000 non-null   int64
 1   GP           8000 non-null   int64
 2   MIN          8000 non-null   float64
 3   PTS          8000 non-null   float64
 4   FGM          8000 non-null   float64
 5   FGA          8000 non-null   float64
 6   FG%          8000 non-null   float64
 7   3P Made      8000 non-null   float64
 8   3PA          8000 non-null   float64
 9   3P%          8000 non-null   float64
 10  FTM          8000 non-null   float64
 11  FTA          8000 non-null   float64
 12  FT%          8000 non-null   float64
 13  OREB         8000 non-null   float64
 14  DREB         8000 non-null   float64
 15  REB          8000 non-null   float64
 16  AST          8000 non-null   float64
 17  STL          8000 non-null   float64
 18  BLK          8000 non-null   float64
 19  TOV          8000 non-null   float64
 20  TARGET_5Yrs  8000 non-null   int64
dtypes: float64(18), int64(3)
memory usage: 1.3 MB
# Summarise the test DataFrame: column names, non-null counts, dtypes and memory usage
df_test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3799 entries, 0 to 3798
Data columns (total 20 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Id       3799 non-null   int64
 1   GP       3799 non-null   int64
 2   MIN      3799 non-null   float64
 3   PTS      3799 non-null   float64
 4   FGM      3799 non-null   float64
 5   FGA      3799 non-null   float64
 6   FG%      3799 non-null   float64
 7   3P Made  3799 non-null   float64
 8   3PA      3799 non-null   float64
 9   3P%      3799 non-null   float64
 10  FTM      3799 non-null   float64
 11  FTA      3799 non-null   float64
 12  FT%      3799 non-null   float64
 13  OREB     3799 non-null   float64
 14  DREB     3799 non-null   float64
 15  REB      3799 non-null   float64
 16  AST      3799 non-null   float64
 17  STL      3799 non-null   float64
 18  BLK      3799 non-null   float64
 19  TOV      3799 non-null   float64
dtypes: float64(18), int64(2)
memory usage: 593.7 KB
# Summary statistics (count, mean, std, min, quartiles, max) for each column of the train DataFrame
df_train.describe()
| | Id | GP | MIN | PTS | FGM | FGA | FG% | 3P Made | 3PA | 3P% | ... | FTA | FT% | OREB | DREB | REB | AST | STL | BLK | TOV | TARGET_5Yrs |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8000.00000 | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 | ... | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 | 8000.000000 |
| mean | 7798.50000 | 62.777875 | 18.576662 | 7.267088 | 2.807037 | 6.231212 | 44.608900 | 0.264525 | 0.816562 | 19.583700 | ... | 1.947788 | 71.365825 | 1.077838 | 2.168500 | 3.245300 | 1.624513 | 0.648687 | 0.245212 | 1.257763 | 0.833625 |
| std | 2309.54541 | 17.118774 | 8.935263 | 4.318732 | 1.693373 | 3.584559 | 6.155453 | 0.384093 | 1.060964 | 16.003155 | ... | 1.252352 | 10.430447 | 0.785670 | 1.392224 | 2.085154 | 1.355986 | 0.407626 | 0.821037 | 0.723270 | 0.372440 |
| min | 3799.00000 | -8.000000 | 2.900000 | 0.800000 | 0.300000 | 0.800000 | 21.300000 | -1.100000 | -3.100000 | -38.500000 | ... | 0.000000 | -13.300000 | 0.000000 | 0.200000 | 0.300000 | 0.000000 | 0.000000 | -17.900000 | 0.100000 | 0.000000 |
| 25% | 5798.75000 | 51.000000 | 12.000000 | 4.100000 | 1.600000 | 3.600000 | 40.400000 | 0.000000 | 0.100000 | 8.400000 | ... | 1.000000 | 65.000000 | 0.500000 | 1.100000 | 1.700000 | 0.700000 | 0.300000 | 0.100000 | 0.700000 | 1.000000 |
| 50% | 7798.50000 | 63.000000 | 16.800000 | 6.300000 | 2.400000 | 5.400000 | 44.400000 | 0.300000 | 0.800000 | 19.500000 | ... | 1.700000 | 71.400000 | 0.900000 | 1.900000 | 2.800000 | 1.300000 | 0.600000 | 0.200000 | 1.100000 | 1.000000 |
| 75% | 9798.25000 | 74.000000 | 23.500000 | 9.500000 | 3.700000 | 8.100000 | 48.700000 | 0.500000 | 1.500000 | 30.600000 | ... | 2.600000 | 77.500000 | 1.500000 | 2.900000 | 4.300000 | 2.200000 | 0.900000 | 0.400000 | 1.600000 | 1.000000 |
| max | 11798.00000 | 123.000000 | 73.800000 | 34.200000 | 13.100000 | 28.900000 | 67.200000 | 1.700000 | 4.700000 | 82.100000 | ... | 11.100000 | 168.900000 | 5.500000 | 11.000000 | 15.900000 | 12.800000 | 3.600000 | 18.900000 | 5.300000 | 1.000000 |
8 rows × 21 columns
# Summary statistics (count, mean, std, min, quartiles, max) for each column of the test DataFrame
df_test.describe()
| | Id | GP | MIN | PTS | FGM | FGA | FG% | 3P Made | 3PA | 3P% | FTM | FTA | FT% | OREB | DREB | REB | AST | STL | BLK | TOV |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 | 3799.000000 |
| mean | 1899.000000 | 62.853909 | 18.650224 | 7.328034 | 2.835404 | 6.302580 | 44.599079 | 0.255962 | 0.796920 | 19.234746 | 1.399842 | 1.953567 | 71.612924 | 1.096025 | 2.179495 | 3.275783 | 1.636483 | 0.653593 | 0.257726 | 1.257910 |
| std | 1096.821164 | 17.151740 | 8.727259 | 4.294724 | 1.688427 | 3.579221 | 6.040168 | 0.380987 | 1.052862 | 15.968989 | 0.926140 | 1.250376 | 10.457336 | 0.785678 | 1.371935 | 2.070646 | 1.335496 | 0.410573 | 0.639660 | 0.712449 |
| min | 0.000000 | 6.000000 | 3.700000 | 0.700000 | 0.300000 | 0.800000 | 25.100000 | -1.000000 | -2.700000 | -38.000000 | 0.000000 | 0.000000 | 23.700000 | 0.000000 | 0.200000 | 0.300000 | 0.000000 | 0.000000 | -7.100000 | 0.100000 |
| 25% | 949.500000 | 51.000000 | 12.200000 | 4.200000 | 1.600000 | 3.700000 | 40.500000 | 0.000000 | 0.100000 | 8.500000 | 0.700000 | 1.000000 | 65.000000 | 0.500000 | 1.200000 | 1.800000 | 0.600000 | 0.400000 | 0.100000 | 0.700000 |
| 50% | 1899.000000 | 63.000000 | 17.000000 | 6.400000 | 2.500000 | 5.500000 | 44.600000 | 0.300000 | 0.800000 | 19.400000 | 1.200000 | 1.700000 | 71.500000 | 0.900000 | 1.900000 | 2.800000 | 1.300000 | 0.600000 | 0.200000 | 1.100000 |
| 75% | 2848.500000 | 74.000000 | 23.300000 | 9.400000 | 3.700000 | 8.100000 | 48.500000 | 0.500000 | 1.500000 | 30.250000 | 1.900000 | 2.600000 | 78.000000 | 1.500000 | 2.900000 | 4.300000 | 2.300000 | 0.900000 | 0.400000 | 1.600000 |
| max | 3798.000000 | 126.000000 | 68.000000 | 33.000000 | 13.400000 | 26.200000 | 74.600000 | 1.600000 | 4.300000 | 73.800000 | 7.800000 | 9.800000 | 127.100000 | 6.900000 | 12.000000 | 18.500000 | 9.000000 | 2.700000 | 14.800000 | 5.200000 |
# Compute the pairwise correlation matrix of the train DataFrame columns
# (DataFrame.corr uses Pearson correlation unless another method is given).
df_train_corr = df_train.corr()
# Print through to_string() so every column is rendered: a plain print() of a
# wide frame replaces the middle columns with "..." once the
# display.max_columns limit is exceeded, hiding part of the matrix.
print(df_train_corr.to_string())
Id GP MIN PTS FGM FGA \
Id 1.000000 0.003940 -0.002747 0.003048 0.001812 0.000376
GP 0.003940 1.000000 0.608090 0.578344 0.577697 0.553374
MIN -0.002747 0.608090 1.000000 0.904840 0.895957 0.895085
PTS 0.003048 0.578344 0.904840 1.000000 0.989208 0.973297
FGM 0.001812 0.577697 0.895957 0.989208 1.000000 0.972670
FGA 0.000376 0.553374 0.895085 0.973297 0.972670 1.000000
FG% 0.010359 0.278510 0.234919 0.301991 0.343140 0.140998
3P Made -0.009092 0.123231 0.358597 0.337786 0.288906 0.373362
3PA -0.005847 0.119146 0.372454 0.349452 0.299300 0.399025
3P% -0.001710 0.045434 0.153846 0.156044 0.123230 0.203040
FTM 0.008232 0.524276 0.767647 0.864463 0.810273 0.787989
FTA 0.008845 0.519154 0.753660 0.842903 0.798366 0.761174
FT% -0.003565 0.155667 0.212366 0.257157 0.208038 0.263515
OREB -0.004322 0.402654 0.542782 0.539289 0.565086 0.464325
DREB -0.004517 0.492070 0.770128 0.690687 0.699456 0.635072
REB -0.004924 0.484485 0.728842 0.673621 0.689229 0.607930
AST 0.008069 0.402126 0.639013 0.573893 0.550682 0.619597
STL 0.013752 0.473409 0.759933 0.678514 0.663419 0.689391
BLK -0.005705 0.187269 0.260428 0.232923 0.245199 0.191895
TOV 0.013185 0.537821 0.792059 0.816414 0.799119 0.816810
TARGET_5Yrs 0.006645 0.242108 0.182203 0.182004 0.183568 0.162389
FG% 3P Made 3PA 3P% ... FTA FT% \
Id 0.010359 -0.009092 -0.005847 -0.001710 ... 0.008845 -0.003565
GP 0.278510 0.123231 0.119146 0.045434 ... 0.519154 0.155667
MIN 0.234919 0.358597 0.372454 0.153846 ... 0.753660 0.212366
PTS 0.301991 0.337786 0.349452 0.156044 ... 0.842903 0.257157
FGM 0.343140 0.288906 0.299300 0.123230 ... 0.798366 0.208038
FGA 0.140998 0.373362 0.399025 0.203040 ... 0.761174 0.263515
FG% 1.000000 -0.269994 -0.330586 -0.312446 ... 0.335512 -0.187200
3P Made -0.269994 1.000000 0.979340 0.585280 ... 0.088516 0.320845
3PA -0.330586 0.979340 1.000000 0.576168 ... 0.101186 0.335011
3P% -0.312446 0.585280 0.576168 1.000000 ... -0.054517 0.339894
FTM 0.279252 0.149567 0.165116 0.013036 ... 0.975611 0.274606
FTA 0.335512 0.088516 0.101186 -0.054517 ... 1.000000 0.098091
FT% -0.187200 0.320845 0.335011 0.339894 ... 0.098091 1.000000
OREB 0.539317 -0.200575 -0.213506 -0.297108 ... 0.616218 -0.190785
DREB 0.429142 0.076694 0.069782 -0.105370 ... 0.658778 -0.042316
REB 0.490902 -0.014098 -0.023208 -0.181554 ... 0.678932 -0.098987
AST -0.132006 0.375967 0.409646 0.291623 ... 0.436225 0.306042
STL 0.071364 0.305817 0.337407 0.195759 ... 0.572355 0.196138
BLK 0.288155 -0.094750 -0.101143 -0.169376 ... 0.261547 -0.150871
TOV 0.128622 0.231413 0.255422 0.095169 ... 0.762837 0.179152
TARGET_5Yrs 0.158858 0.015016 0.003463 -0.011977 ... 0.176525 0.039429
OREB DREB REB AST STL BLK \
Id -0.004322 -0.004517 -0.004924 0.008069 0.013752 -0.005705
GP 0.402654 0.492070 0.484485 0.402126 0.473409 0.187269
MIN 0.542782 0.770128 0.728842 0.639013 0.759933 0.260428
PTS 0.539289 0.690687 0.673621 0.573893 0.678514 0.232923
FGM 0.565086 0.699456 0.689229 0.550682 0.663419 0.245199
FGA 0.464325 0.635072 0.607930 0.619597 0.689391 0.191895
FG% 0.539317 0.429142 0.490902 -0.132006 0.071364 0.288155
3P Made -0.200575 0.076694 -0.014098 0.375967 0.305817 -0.094750
3PA -0.213506 0.069782 -0.023208 0.409646 0.337407 -0.101143
3P% -0.297108 -0.105370 -0.181554 0.291623 0.195759 -0.169376
FTM 0.544742 0.619711 0.625154 0.486761 0.591616 0.221537
FTA 0.616218 0.658778 0.678932 0.436225 0.572355 0.261547
FT% -0.190785 -0.042316 -0.098987 0.306042 0.196138 -0.150871
OREB 1.000000 0.778602 0.897889 -0.003037 0.290161 0.430952
DREB 0.778602 1.000000 0.970105 0.223116 0.437507 0.441530
REB 0.897889 0.970105 1.000000 0.149737 0.408584 0.458246
AST -0.003037 0.223116 0.149737 1.000000 0.737032 -0.084823
STL 0.290161 0.437507 0.408584 0.737032 1.000000 0.095003
BLK 0.430952 0.441530 0.458246 -0.084823 0.095003 1.000000
TOV 0.378683 0.548209 0.511051 0.738581 0.707753 0.144671
TARGET_5Yrs 0.172604 0.168065 0.175627 0.096771 0.128381 0.098642
TOV TARGET_5Yrs
Id 0.013185 0.006645
GP 0.537821 0.242108
MIN 0.792059 0.182203
PTS 0.816414 0.182004
FGM 0.799119 0.183568
FGA 0.816810 0.162389
FG% 0.128622 0.158858
3P Made 0.231413 0.015016
3PA 0.255422 0.003463
3P% 0.095169 -0.011977
FTM 0.771939 0.173913
FTA 0.762837 0.176525
FT% 0.179152 0.039429
OREB 0.378683 0.172604
DREB 0.548209 0.168065
REB 0.511051 0.175627
AST 0.738581 0.096771
STL 0.707753 0.128381
BLK 0.144671 0.098642
TOV 1.000000 0.151240
TARGET_5Yrs 0.151240 1.000000
[21 rows x 21 columns]
import matplotlib.pyplot as plt
import seaborn as sns
# Draw the grid of pairwise relationships between the train DataFrame columns,
# colouring the points by the TARGET_5Yrs label, then render the figure.
sns.pairplot(
    data=df_train,
    hue='TARGET_5Yrs',
)
plt.show()